# -*- coding: utf-8 -*-

import os, jieba, logging
from gensim import corpora, models, similarities
import numpy as np

class Similarity(object):
	"""TF-IDF based document similarity search over a directory of text files.

	Every file in ``file_dir`` becomes one candidate document; ``stopword_dir``
	is the path of a stopword file (one stopword per line).  ``run(target)``
	ranks all documents by cosine similarity to ``target``.
	"""

	def __init__(self, file_dir, stopword_dir):
		super(Similarity, self).__init__()

		# Load every regular file in file_dir; keep the names so scores can
		# be mapped back to their source file in mapping().
		self.texts = []
		self.filenames = []
		for name in os.listdir(file_dir):
			# os.path.join fixes the original `file_dir + name`, which broke
			# whenever file_dir had no trailing separator.
			path = os.path.join(file_dir, name)
			if not os.path.isfile(path):
				continue  # listdir can return subdirectories; open() would raise
			self.filenames.append(name)
			with open(path, encoding='utf-8') as f:
				self.texts.append(f.read())

		# A set gives O(1) membership tests in cut(); the original list was
		# O(n) per token.
		with open(stopword_dir, encoding='utf-8') as f:
			self.stopwords = {line.strip() for line in f}

		# jieba.enable_paddle()

	def run(self, target):
		"""Rank all corpus documents by TF-IDF cosine similarity to ``target``.

		Returns a list of ``{'key': filename, 'value': score}`` dicts sorted
		by descending similarity score.
		"""
		# Tokenize the corpus (list of token lists, stopwords removed).
		split_texts = [self.cut(text) for text in self.texts]

		# Build the token <-> id dictionary from the corpus.
		dictionary = corpora.Dictionary(split_texts)

		# Convert each token list into a sparse bag-of-words vector.
		corpus = [dictionary.doc2bow(text) for text in split_texts]

		# Train a TF-IDF model on the corpus and weight every document.
		tfidf = models.TfidfModel(corpus)
		tfidf_corpus = tfidf[corpus]

		# Dense cosine-similarity index over the weighted corpus.
		index = similarities.MatrixSimilarity(tfidf_corpus)

		logging.info('**************************************************************')
		# Lazy %-args: the message is only formatted if the record is emitted.
		logging.info('Search text:\n%s\n', target)
		target_vector = dictionary.doc2bow(self.cut(target))
		tfidf_target = tfidf[target_vector]

		sim = index[tfidf_target]
		logging.info('Similarity result:\n%s\n', sim)

		# argmax replaces the original max()/np.where() pair: one pass,
		# same result (first index of the maximum).
		best = int(np.argmax(sim))
		logging.info('Maximum similarity: [%s %s]', self.filenames[best], sim[best])

		return self.mapping(sim)

	def cut(self, txt):
		"""Segment ``txt`` with jieba and drop stopwords; return a token list."""
		return [word for word in jieba.cut(txt) if word not in self.stopwords]

	def mapping(self, sim):
		"""Pair each filename with its score and sort by descending score."""
		result = [{'key': name, 'value': score}
				  for name, score in zip(self.filenames, sim)]
		return sorted(result, key=lambda x: x['value'], reverse=True)
